This document contains analysis of differences in microbial alpha diversity between healthy samples from American Gut Project and samples with diagnosed Inflammatory bowel syndrome (Ulcerative colitis and Crohn’s disease) and Clostridium difficile infection, as well as samples after fecal microbiota transplantation.

The aim of this analysis is to show that there are significant differences between microbiome alpha diversity in healthy and disease samples. Furthermore, we want to show that fecal transplantation improves alpha diversity in short- and long-term.

Loading libraries:

#install.packages("readr")
library("readr")
#install.packages('data.table')
library(data.table)
library(stringr)
#install.packages('ggplot2')
library(ggplot2)
library(gridExtra)
library(grid)
library(plyr)
library(purrr)
library(dplyr)
#install.packages('flextable')
library(flextable)
library(tibble)
library(ComplexHeatmap)
library(RColorBrewer)
library(ggplotify)
library(here)

Load healthy subsample of AGP data:

# Load all AGP data
AGP <- read.csv(here("01_tidy_data", "AGP_all.csv.gz"), header = TRUE, sep = ",")

# Load healthy samples' table
all_healthy <- read.csv(here("01_tidy_data", "AGP_healthy.csv.gz"), header = TRUE, sep = ",")

all_healthy <- select(all_healthy, sample_id, shannon_entropy, chao1, menhinick, margalef, fisher_alpha, simpson, pielou_evenness, gini_index, strong, simpson, faith_pd, sex, race, host_age)

all_healthy$condition <- 'healthy'
all_healthy$qiita_study_id <- AGP$qiita_study_id[match(all_healthy$sample_id, AGP$sample_id)]

names(all_healthy)[names(all_healthy) == 'host_age'] <- 'age'

Load Inflammatory Bowel Disease data:

all_IBD <- read.csv(here("01_tidy_data", "IBD_all.csv.gz"), header = TRUE, sep = ",")

Load Longitudinal Chron’s disease study data:

CD_2 <- read.csv(here("01_tidy_data", "all_CD_2.csv.gz"), header = TRUE, sep = ",")

Load Changes following fecal microbial transplantation for recurrent CDI data:

C_diff_trans <- read.csv(here("01_tidy_data", "all_C_diff_trans.csv.gz"), header = TRUE, sep = ",")

Load Fecal transplant - CDI with underlying IBD data:

trans_IBD_CDI <- read.csv(here("01_tidy_data", "all_trans_IBD_CDI.csv.gz"), header = TRUE, sep = ",")

Let’s define vector of names of the alpha diversity metrics that are going to be analysed:

metric <- c("shannon_entropy", "chao1", "menhinick", "margalef", "fisher_alpha", "simpson", "gini_index", "strong", "pielou_evenness", "faith_pd" ) 

Healthy samples vs IBD samples

#merge two datasets
healthy_disease <-  rbind.fill(all_healthy, all_IBD)

healthy_disease$condition <- as.factor(healthy_disease$condition)
healthy_disease$condition <- relevel(healthy_disease$condition, "healthy")

table(healthy_disease$condition)
## 
## healthy      CD      UC 
##     600      24      40
nrow(healthy_disease)
## [1] 664

Distributions of metrics in disease datasets

Generate a vector of 10 random colors for histograms:

qual_col_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
colors <- unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
histo_IBD <- vector('list', length(metric))

for (i in 1:length(metric)){
  histo_IBD[[i]] <- all_IBD %>% ggplot(aes(x = .data[[metric[i]]])) +
    geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30)+
    geom_density(alpha=.2, fill=colors[i]) +
    xlab(label = metric[i]) + 
    ylab(label = "density")
}

grid.arrange(histo_IBD[[1]], histo_IBD[[2]],histo_IBD[[3]], histo_IBD[[4]],histo_IBD[[5]], histo_IBD[[6]],histo_IBD[[7]], histo_IBD[[8]],histo_IBD[[9]], histo_IBD[[10]], ncol=3, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2)))

Distributions of metrics in American Gut Project dataset

histo_healthy <- vector('list', length(metric))

for (i in 1:length(metric)){
  histo_healthy[[i]] <- all_healthy %>% ggplot(aes(x = .data[[metric[i]]])) +
    geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30)+
    geom_density(alpha=.2, fill=colors[i]) +
    xlab(label = metric[i]) + 
    ylab(label = "density") 
}

grid.arrange(histo_healthy[[1]], histo_healthy[[2]], histo_healthy[[3]], histo_healthy[[4]], histo_healthy[[5]], histo_healthy[[6]], histo_healthy[[7]], histo_healthy[[8]], histo_healthy[[9]], histo_IBD[[10]], ncol=3, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in healthy dataset", gp=gpar(fontsize=10,font=2))) 

Compaisons between two datasets

Density plots and Box plots

density <- vector('list', length(metric))
box <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
  
  density[[i]] <- healthy_disease %>% ggplot(aes(x = .data[[metric[i]]], color = condition)) +
    geom_density()+
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])

  box[[i]] <- healthy_disease %>% ggplot(aes(x = .data[[metric[i]]], color = condition)) +
    geom_boxplot() +
    labs(x = metric[i])
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index" &  metric[i] != "menhinick"){
    density [[i]] <- density [[i]] + 
     scale_x_continuous(trans = 'log10') +
     xlab(paste(metric[i], "(log10)", sep = " ")) 
    
    box [[i]] <- box [[i]] + 
     scale_x_continuous(trans = 'log10') +
     xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

# Show plots
for (j in 1:length(metric)){
  grid.arrange(density[[j]], box[[j]], ncol=2, top = paste("Density and box plot comparing healthy vs diseases data for metric:", metric[j], sep=" "))
}

Explore differences in distribution shape and mean values of groups with different conditions by ploting violin plot:

violin <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
  plot_data <- healthy_disease %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 

  violin[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("")+
    #ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
    theme(legend.position="none") 
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
   violin [[i]] <- violin [[i]] + 
     scale_x_continuous(trans = 'log10') +
     xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

# Show plots

#violin

grid.arrange(violin[[1]], violin[[2]],violin[[3]], violin[[4]],violin[[5]], violin[[6]],violin[[7]], violin[[8]],violin[[9]], violin[[10]], ncol=4, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2))) 

Mann-Whitney-Wilcoxon Test

Test whether different conditions separate into distinct distributions with Mann-Whitney-Wilcoxon test:

test_helathy_IBD <- list()

for (i in 1:length(metric)){
  test_helathy_IBD[[i]] <- pairwise.wilcox.test(healthy_disease[[metric[i]]], healthy_disease$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_helathy_IBD[[i]]$p.value <- round(test_helathy_IBD[[i]]$p.value, digits = 17)
}

tests <- do.call(what = rbind, args = test_helathy_IBD)

table1 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")

table2 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")

Orered by parameter:

table1

Ordered by signifficance:

table2

An FDR-adjusted p-value (also called q-value) of 0.05 indicates that 5% of significant tests will result in false positives. In other words, an FDR of 5% means that, among all results called significant, only 5% of these are truly null.

Now lets see which parameters show significant differenc between healthy and unhealthy (all conditions except healthy) samples:

wilcox_healthy_disease <- healthy_disease %>%
  summarise(Shannon = wilcox.test(shannon_entropy[condition == "healthy"], shannon_entropy[condition != "healthy"])$p.value,
            Chao1 = wilcox.test(chao1[condition == "healthy"], chao1[condition != "healthy"])$p.value,
            Menhinick = wilcox.test(menhinick[condition == "healthy"], menhinick[condition != "healthy"])$p.value,
            Margalef = wilcox.test(margalef[condition == "healthy"], margalef[condition != "healthy"])$p.value,
            Pielou = wilcox.test(pielou_evenness[condition == "healthy"], pielou_evenness[condition != "healthy"])$p.value,
            Fisher = wilcox.test(fisher_alpha[condition == "healthy"], fisher_alpha[condition != "healthy"])$p.value,
            Gini = wilcox.test(gini_index[condition == "healthy"], gini_index[condition != "healthy"])$p.value,
            Strong = wilcox.test(strong[condition == "healthy"], strong[condition != "healthy"])$p.value,
            Faith = wilcox.test(faith_pd[condition == "healthy"], faith_pd[condition != "healthy"])$p.value) 
wilcox_healthy_disease <- t(wilcox_healthy_disease)
colnames(wilcox_healthy_disease) <- c("p.value")
wilcox_healthy_disease <- data.frame(Alpha_Metric = row.names(wilcox_healthy_disease), wilcox_healthy_disease)
wilcox_healthy_disease$p.value <- round(wilcox_healthy_disease$p.value, digits = 17)

wilcox_healthy_disease %>%
  add_column(p.adjusted = p.adjust(wilcox_healthy_disease$p.value, "fdr"), .after='p.value') %>%
  flextable() %>%
  bold(~ p.value < 0.05, 2) %>%
  bold(~ p.adjusted < 0.05, 3) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples")

There is significant difference between healthy samples and those that are not healthy in all alpha metrics except from Pielou’s evenness.

Check the correlation between alpha diversity measures and condition with Kruskal-Wallis Test

Kruskal-Wallis Testis a non-parametric method for testing whether samples originate from the same distribution. It is used for comparing two or more independent samples of equal or different sample sizes. It extends the Mann–Whitney U test, which is used for comparing only two groups (source: Wikipedia)

kruskal_results <- healthy_disease %>%
  summarise(Shannon = kruskal.test(healthy_disease$shannon_entropy ~ healthy_disease$condition)$p.value,
  Chao1 = kruskal.test(healthy_disease$chao1 ~ healthy_disease$condition)$p.value,
  Fisher = kruskal.test(healthy_disease$fisher_alpha ~ healthy_disease$condition)$p.value,
  Margalef =kruskal.test(healthy_disease$margalef ~ healthy_disease$condition)$p.value,
  Simpson = kruskal.test(healthy_disease$simpson ~ healthy_disease$condition)$p.value,
  Menhinick = kruskal.test(healthy_disease$menhinick ~ healthy_disease$condition)$p.value,
  Pielou = kruskal.test(healthy_disease$pielou_evenness ~ healthy_disease$condition)$p.value,
  Gini = kruskal.test(healthy_disease$gini_index ~ healthy_disease$condition)$p.value,
  Strong = kruskal.test(healthy_disease$strong ~ healthy_disease$condition)$p.value,
  Faith = kruskal.test(healthy_disease$faith_pd ~ healthy_disease$condition)$p.value)

kruskal_results <- as.data.frame(t(kruskal_results))
colnames(kruskal_results) <- c("p.value")
kruskal_results <- data.frame(Alpha_Metric = row.names(kruskal_results), kruskal_results)
kruskal_results$p.value <- round(kruskal_results$p.value, digits = 17)


kruskal_results %>% 
  add_column(p.adjusted = p.adjust(kruskal_results$p.value, "fdr"), .after='p.value') %>%
  flextable() %>%
  bold(~ p.value < 0.05, 2) %>%
   bold(~ p.adjusted < 0.05, 3) %>%
  add_header_lines(values = "Results of the Kruskal-Wallis test for differentiation of different parameters across different conditions")

This comparison shows us that there IS a difference between alpha diversity of healthy and IBD samples. The downside of this analysis is the small sample size of IBD dataset (UC and CD).

Longitudinal Chron’s disease analysis

Since previous analysis consisted from small sample size for IBD dataset and Chron’s disease samples showed unexpectedly high alpha diversity in various indexes, lets incorporate another study, this time only with CD patients. The sample size of this data set is 688. The number of patients and controls is balanced and controls are close relatives of patients. each patietn was sampled multiple times during the period of 6 weeks. Some patients previously underwent a surgical intervention on some part of their dygestive system.

table(CD_2$description, CD_2$condition)
##                           
##                            control crohns
##   father family 01-00010        27      0
##   father family 01-00011        20      0
##   father family 01-00012        54      0
##   father family 01-00013        27      0
##   fecal sample index pt 20       0     20
##   fecal sample index pt 21       0     25
##   fecal sample index pt 23       0     21
##   fecal sample index pt 24       0     26
##   fecal sample index pt 25       0     14
##   fecal sample index pt 26       0     16
##   fecal sample index pt 27       0     14
##   fecal sample index pt 31       0     10
##   fecal sample index pt 32       0     22
##   fecal sample index pt 33       0     20
##   fecal sample index pt 34       0      2
##   fecal sample index pt 35       0     14
##   fecal sample index pt 36       0      9
##   fecal sample index pt 37       0     22
##   fecal sample index pt 39       0     26
##   index family 01-00012          0     48
##   index family 01-00013          0     26
##   mother family 01-00010        25      0
##   mother family 01-00011        23      0
##   mother family 01-00012        57      0
##   mother family 01-00013        27      0
##   sibling family 01-00010       18      0
##   sibling family 01-00012       55      0
##   sibling family 01-00013       20      0
table(CD_2$condition)
## 
## control  crohns 
##     353     335
table(CD_2$surgery_and_ibd)
## 
##          control           crohns crohns (surgery) 
##              353              173              162

Distribution and mean differences between conditions

Lets do Violin plot to see the difference in alpha diversity between cases and controls in this study:

violin_CD_2a <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_2 %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_CD_2a [[i]] <- CD_2 %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    theme(legend.position="none") 
  
    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_2a [[i]] <- violin_CD_2a [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
    }
}

#plots for Shannon entropy
violin_CD_2a 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

grid.arrange(violin_CD_2a[[1]], violin_CD_2a[[2]],violin_CD_2a[[3]], violin_CD_2a[[4]],violin_CD_2a[[5]], violin_CD_2a[[6]],violin_CD_2a[[7]], violin_CD_2a[[8]],violin_CD_2a[[9]],violin_CD_2a[[10]], ncol=4, top = textGrob("Distributions of 1) Shannon's index  2) Chao1 3) Menhinick's index \n4) Margalef's index 5) Fisher's index 6) Simpson 7) Gini index 8) Strong's index 9) Pielou's evenness \nand 10) Faith's PD in IBD datasets", gp=gpar(fontsize=10,font=2)))

Lets now compare alpha diversity between cases and controls but taking into account if they underwent surgery:

violin_CD_2b <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_2 %>% dplyr::group_by(surgery_and_ibd) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_CD_2b [[i]] <- CD_2 %>% ggplot(aes(x = .data[[metric[i]]], y = surgery_and_ibd, color = surgery_and_ibd, fill = surgery_and_ibd)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = surgery_and_ibd), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    theme(legend.position="none") 
  
    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_2b [[i]] <- violin_CD_2b [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
    }
}

#plots for Shannon entropy
violin_CD_2b 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Lets plot differences in effect of different surgical interventions:

violin_CD_2_surg <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_2 %>% dplyr::group_by(surgery_type) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))
  plot_data <- CD_2 %>% dplyr::group_by(surgery_type) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 

  violin_CD_2_surg [[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(surgery_type, -m), color = surgery_type, fill = surgery_type)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = surgery_type), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    theme(legend.position="none") 
  
    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_2_surg [[i]] <- violin_CD_2_surg [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
    }
}

#plots for Shannon entropy
violin_CD_2_surg 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Finally, lets compare cases and control from this study with cases from previous IBD studies and AGP controls:

CD_1 <- healthy_disease[healthy_disease$condition != "UC",]

CD_2_surg <- CD_2
CD_2_surg$condition <- NULL
names(CD_2_surg)[names(CD_2_surg) == 'surgery_and_ibd'] <- 'condition'

#CD_check <- rbind.fill(CD_1, CD_2)
CD_check <- rbind.fill(CD_1, CD_2_surg)

CD_check$condition[CD_check$condition == "CD"] <-'CD_1'
CD_check$condition[CD_check$condition == "crohns"] <-'CD_2'
CD_check$condition[CD_check$condition == "crohns (surgery)"] <-'CD_2_surgery'
CD_check$condition[CD_check$condition == "healthy"] <-'control(AGP)'
CD_check$condition[CD_check$condition == "control"] <-'control_2'
violin_CD_check <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- CD_check %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_CD_check [[i]] <- CD_check %>% ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    scale_y_discrete(limits=c("CD_2", "control_2", "CD_2_surgery", "CD_1", "control(AGP)")) +
    labs(x = metric[i])+
    ylab("")  +
    theme(legend.position="none") 

    if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_CD_check [[i]] <- violin_CD_check [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

#plots for Shannon entropy
violin_CD_check 
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Here we can see that alpha diversity of CD samples from previous data set have similar mean values as CD_2 samples for all alpha indexes except from Faith’s PD which is much higher and Gini index which is much lower than in this longitudinal study. On the other side, CD samples with past surgery had much lower mean in all measures.

This probably mean that high diversity of CD samples is not a mistake, those are probably just samples with better clinical status.

When it comes to samples with previous surgery, it is unclear whether the lower alpha diversity comes from the severity of CD symptoms or as a direct consequence of surgery (no info about the time that passed after surgery).

test_CD_1 <- list()
test_CD_2 <- list()

for (i in 1:length(metric)){
  test_CD_1[[i]] <- pairwise.wilcox.test(CD_1[[metric[i]]], CD_1$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_CD_1[[i]]$p.value <- round(test_CD_1[[i]]$p.value, digits = 17)
  
  test_CD_2[[i]] <- pairwise.wilcox.test(CD_2[[metric[i]]], CD_2$surgery_and_ibd, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_CD_2[[i]]$p.value <- round(test_CD_2[[i]]$p.value, digits = 17)
}

tests1 <- do.call(what = rbind, args = test_CD_1)
tests2 <- do.call(what = rbind, args = test_CD_2)

table_CD_1 <- tests1 %>% 
  add_column(p.adjusted = p.adjust(tests1$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions in CD_1 dataset")

table_CD_2 <- tests2 %>% 
  add_column(p.adjusted = p.adjust(tests2$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcoxon test for distributions of different conditions in CD_2 dataset")

Results of the Wilcoxon test for diversity distributions of healthy and CD samples in first study:

table_CD_1

This table shows that, in first study, only four indexes show signifficant difference between healthy and CD samples (Gini, Faith, Menhinick, Strong).

Results of the Wilcoxon test for diversity distributions of healthy and CD samples in second (longitudinal) study:

table_CD_2

All alpha indexes show significant difference between healthy and crohn’s (surgery) samples in longitudinal Crohn’s study. Helathy and CD samples without surgery are different in five indexes (Strong, Simpson, Pielou, Shannon, Gini).

CD_check_w <- CD_check %>% filter(CD_check$condition != "control(AGP)" & CD_check$condition != "control_2" )

test_CD_3 <- list()

for (i in 1:length(metric)){
  test_CD_3[[i]] <- pairwise.wilcox.test(CD_check_w[[metric[i]]], CD_check_w$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test_CD_3[[i]]$p.value <- round(test_CD_3[[i]]$p.value, digits = 17)
}

tests <- do.call(what = rbind, args = test_CD_3)

table_CD_3 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different groups of Crhohn's disease patients")

table_CD_3

This table confirms that CD samples (without surgery) from two studies only differ in Faith’s PD and Gini index.

Short- and Longterm changes after fecal transplantation for recurrent CDI

With this data set of rnrow(C_diff_trans)` samples we will explore whether there FMT for recurrent CDI affects the improvement of alpha diversity. Samples from 4 patients were collected in multiple time points after transplantation

table(C_diff_trans$disease_state)
## 
## post-FMT  Pre-FMT 
##      106        5
table(C_diff_trans$day_since_fmt)
## 
## -18  -2  -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  35  36  40 
##   1   1   1   2   3   3   2   2   2   3   3   4   4   3   4   4   3   3   3   3   4   4   2   4   3   3   2   3   1   4   2   1   1   1   1   1   1   1   1 
##  42  47  49  55  56  63  70  77  84 151 
##   2   1   2   1   2   2   2   2   2   1
table(C_diff_trans$animations_subject)
## 
## CD1 CD2 CD3 CD4 
##  36  23  16  36
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD1"] <-'subject_1'
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD2"] <-'subject_2'
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD3"] <-'subject_3'
C_diff_trans$animations_subject[C_diff_trans$animations_subject == "CD4"] <-'subject_4'
progression <- vector('list', length(metric))

for (i in 1:length(metric)){
  progression[[i]] <- C_diff_trans %>% ggplot(aes(x=day_since_fmt, y= .data[[metric[i]]], group=animations_subject)) +
    geom_line(aes(color=animations_subject))+
    geom_point(aes(color=animations_subject))+
    facet_wrap(vars(animations_subject), scale="free", ncol=2)
}

progression
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Even though the value is fluctuating as the time passes, we can see general trend of improvement of alpha diversity after FMT in all subjects.

Fecal transplant data analysis

From the paper: “A recent study suggests significantly lower response of CDI to FMT in patients with underlying inflammatory bowel disease (IBD)”.

The original study is looking at the changes in community composition (taxonomical) in patients after FMT. Lets examine what happens with alpha diversity as the time is passing after transplantation:

table(trans_IBD_CDI$condition)
## 
##      CDI CDI + CD CDI + UC   Donors 
##       28        6        6       28
table(trans_IBD_CDI$day_since_fmt)
## 
##       -1       28        7 NA-Donor 
##       14       11       14       29
trans_IBD_CDI_1 <- trans_IBD_CDI %>%
  filter(day_since_fmt != "no_data" ) %>%
  filter(!(donor_or_patient == "Donor" & condition=="CDI")) 

trans_IBD_CDI_1$day_since_fmt <- as.factor(trans_IBD_CDI_1$day_since_fmt)

violin_trans <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- trans_IBD_CDI_1 %>% dplyr::group_by(condition, day_since_fmt) %>% dplyr::summarise(grp_mean = mean(.data[[metric[i]]]))

  violin_trans[[i]] <- trans_IBD_CDI_1 %>%  
    mutate(across(day_since_fmt, factor, levels=c("-1","7","28","NA-Donor"))) %>% 
    ggplot(aes(x = .data[[metric[i]]], y = condition, color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("") +
    facet_wrap(vars(day_since_fmt), nrow=1)+
    theme(legend.position="none") 
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_trans [[i]] <- violin_trans [[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

#plots for Shannon entropy
violin_trans
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

These plots show consistent trend of improvement of alpha diversity in 7th and 28th day after fecal microbiota transplantation. For all alpha indexes we can see that the diversity is increasing toward donor’s mean value. However, samples with underlying CD show a trend of diversity decrease in 28th day compared to 7th day. This shows that IBD can decrease the efficiency of FMT in CDI recepients.

cond <- c("CDI + UC", "CDI", "CDI + CD")
test_CDI_trans <- list()
table <- list()

for (i in 1:length(cond)){
  trans_IBD_CDI_2 <- trans_IBD_CDI_1 %>%
    filter(condition == cond[i])
  
  for (j in 1:length(metric)){
    test_CDI_trans[[j]] <- pairwise.wilcox.test(trans_IBD_CDI_2[[metric[j]]], trans_IBD_CDI_2$day_since_fmt, p.adjust.method="none") %>% 
    broom::tidy() %>% add_column(parameter = metric[j], .before='group1')
    test_CDI_trans[[j]]$p.value <- round(test_CDI_trans[[j]]$p.value, digits = 17)
  }
  
  tests <- do.call(what = rbind, args = test_CDI_trans)

  table <- tests %>%
    add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
    flextable() %>%
    bold(~ p.value < 0.05, 4) %>%
    bold(~ p.adjusted < 0.05, 5) %>%
    add_header_lines(values = paste("Results of the Wilcox test for condition:", cond[i], sep = " "))
   
 print(table)
 
 test_CDI_trans <- list()
}

AGP + IBD + transplant CDI & IBD + longitudinal CD

Now, lets merge all data sets with IBD and AGP as control:

before_trans <- trans_IBD_CDI %>%
  filter(day_since_fmt != c("7","28", "no_data", "NA-Donor"),
         condition != "Donors")

CD_2_merge <- CD_2 %>%
  filter(condition != "not applicable")

CD_2_merge$condition[CD_2_merge$condition=="control"] <- "healthy"
CD_2_merge$condition[CD_2_merge$condition=="crohns"] <- "CD"

healthy_disease_2 <- rbind.fill(healthy_disease, before_trans, CD_2_merge)
#healthy_disease_2 <- rbind.fill(healthy_disease, before_trans)

# Sizes of each dataset
table(healthy_disease_2$condition)
## 
##       CD      CDI CDI + CD CDI + UC  healthy       UC 
##      359       23        5        4      953       40
ggplot(healthy_disease_2, aes(x = faith_pd)) +
    geom_histogram(aes(y=..density..), colour="black", fill="white", bins=30) +
    geom_density (alpha=.2, fill="#009E73") +
    xlab(label = "faith_pd") + 
    ylab(label = "density") +
    scale_x_continuous(trans = 'log10') +
    facet_wrap(vars(condition), scales = "free_y")

Distribution of Faith’s PD in different groups shows modality in most of the IBD groups. *The values on y scale are densities instead of frequencies (what percentage of observations in a dataset fall between different values)

violin_all <- vector('list', length(metric))

for (i in 1:length(metric)){
  mean_line <- healthy_disease_2 %>% dplyr::group_by(condition) %>% dplyr::summarise(grp_mean=mean(.data[[metric[i]]]))
  plot_data <- healthy_disease_2 %>% dplyr::group_by(condition) %>% dplyr::mutate(m = mean(.data[[metric[i]]])) 

  violin_all[[i]] <- plot_data %>% ggplot(aes(x = .data[[metric[i]]], y = reorder(condition, -m), color = condition, fill = condition)) +
    geom_violin()+
    geom_boxplot(width=0.1, color="grey", alpha=0.2) +
    geom_vline(data = mean_line, aes(xintercept = grp_mean, color = condition), linetype = "dashed")+ 
    labs(x = metric[i])+
    ylab("")+
    ggtitle("A Violin plot of distributions of alpha metrics in different conditions")+
    theme(legend.position="none") 
  
  if(metric[i] != "shannon_entropy" & metric[i] !="strong" & metric[i] != "gini_index"  &  metric[i] != "menhinick"){
     violin_all[[i]] <- violin_all[[i]] + 
       scale_x_continuous(trans = 'log10') +
       xlab(paste(metric[i], "(log10)", sep = " ")) 
  }
}

#plots for Shannon entropy
violin_all
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

Mann-Whitney-Wilcoxon Test

Non-parametric test (does not assume normal distribution)

test <- list()

for (i in 1:length(metric)){
  test[[i]] <- pairwise.wilcox.test(healthy_disease_2[[metric[i]]], healthy_disease_2$condition, p.adjust.method="none") %>% 
  broom::tidy() %>% add_column(parameter = metric[i], .before='group1')
  test[[i]]$p.value <- round(test[[i]]$p.value, digits = 17)
}

tests <- do.call(what = rbind, args = test)

table1 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(p.value, parameter)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")

table2 <- tests %>% 
  add_column(p.adjusted = p.adjust(tests$p.value, "fdr"), .after='p.value') %>%
  arrange(parameter, group1)  %>%
  flextable() %>% 
  bold(~ p.value < 0.05, 4) %>%
  bold(~ p.adjusted < 0.05, 5) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different conditions")
#table1
#table2
# Do Wilcox test to see which parameters show significant differenc between healthy and unhealthy samples
wilcox_p_value <- healthy_disease_2 %>%
  summarise(Shannon = wilcox.test(shannon_entropy[condition == "healthy"], shannon_entropy[condition != "healthy"])$p.value,
            Chao1 = wilcox.test(chao1[condition == "healthy"], chao1[condition != "healthy"])$p.value,
            Menhinick = wilcox.test(menhinick[condition == "healthy"], menhinick[condition != "healthy"])$p.value,
            Margalef = wilcox.test(margalef[condition == "healthy"], margalef[condition != "healthy"])$p.value,
            Simpson = wilcox.test(simpson[condition == "healthy"], simpson[condition != "healthy"])$p.value,
            Fisher = wilcox.test(fisher_alpha[condition == "healthy"], fisher_alpha[condition != "healthy"])$p.value,
            Pielou = wilcox.test(pielou_evenness[condition == "healthy"], pielou_evenness[condition != "healthy"])$p.value,
            Gini = wilcox.test(gini_index[condition == "healthy"], gini_index[condition != "healthy"])$p.value,
            Strong = wilcox.test(strong[condition == "healthy"], strong[condition != "healthy"])$p.value,
            Faith = wilcox.test(faith_pd[condition == "healthy"], faith_pd[condition != "healthy"])$p.value) 
wilcox_p_value <- t(wilcox_p_value)
colnames(wilcox_p_value) <- c("p.value")
wilcox_p_value <- data.frame(Alpha_Metric = row.names(wilcox_p_value), wilcox_p_value)
wilcox_p_value$p.value <- round(wilcox_p_value$p.value, digits = 17)

wilcox_p_value %>%
  flextable() %>%
  bold(~ p.value < 0.05, 2) %>%
  add_header_lines(values = "Results of the Wilcox test for distributions of different parameters between healthy and unhealthy samples")

Conclusions

Based on previous analysis we can draw following conclusions:

Session information

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=es_ES.UTF-8   
##  [6] LC_MESSAGES=en_US.UTF-8    LC_PAPER=es_ES.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rmarkdown_2.17             knitr_1.40                 ggplotify_0.1.0            ComplexHeatmap_2.10.0      data.table_1.14.4         
##  [6] lubridate_1.8.0            forcats_0.5.2              readr_2.1.4                tidyr_1.2.1                tidyverse_1.3.2.9000      
## [11] plotly_4.10.1              shiny_1.7.3                here_1.0.1                 corrplot_0.92              PerformanceAnalytics_2.0.4
## [16] xts_0.12.1                 zoo_1.8-9                  nortest_1.0-4              flextable_0.8.2            cowplot_1.1.1             
## [21] plyr_1.8.7                 CGPfunctions_0.6.3         gridExtra_2.3              RColorBrewer_1.1-3         stringr_1.4.1             
## [26] ggplot2_3.4.0              tibble_3.1.8               purrr_0.3.5                dplyr_1.0.10              
## 
## loaded via a namespace (and not attached):
##   [1] circlize_0.4.15        readxl_1.4.1           uuid_1.1-0             backports_1.4.1        systemfonts_1.0.4      lazyeval_0.2.2        
##   [7] splines_4.1.2          TH.data_1.1-0          digest_0.6.30          yulab.utils_0.0.5      foreach_1.5.2          htmltools_0.5.3       
##  [13] rsconnect_0.8.25       fansi_1.0.3            memoise_2.0.1          magrittr_2.0.3         cluster_2.1.2          doParallel_1.0.17     
##  [19] paletteer_1.5.0        tzdb_0.3.0             modelr_0.1.9           matrixStats_0.61.0     officer_0.4.4          sandwich_3.0-1        
##  [25] colorspace_2.0-3       ggrepel_0.9.1          textshaping_0.3.6      xfun_0.37              crayon_1.5.2           jsonlite_1.8.3        
##  [31] libcoin_1.0-9          Exact_3.2              lme4_1.1-28            iterators_1.0.14       survival_3.2-13        glue_1.6.2            
##  [37] gtable_0.3.1           emmeans_1.8.2          MatrixModels_0.5-0     GetoptLong_1.0.5       sjstats_0.18.2         sjmisc_2.8.9          
##  [43] shape_1.4.6            BiocGenerics_0.40.0    scales_1.2.1           mvtnorm_1.1-3          DBI_1.1.3              Rcpp_1.0.8            
##  [49] viridisLite_0.4.1      xtable_1.8-4           performance_0.10.1     clue_0.3-62            gridGraphics_0.5-1     proxy_0.4-26          
##  [55] Formula_1.2-4          stats4_4.1.2           datawizard_0.6.4       htmlwidgets_1.5.4      httr_1.4.4             ellipsis_0.3.2        
##  [61] pkgconfig_2.0.3        farver_2.1.1           sass_0.4.2             utf8_1.2.2             tidyselect_1.2.0       labeling_0.4.2        
##  [67] rlang_1.0.6            later_1.3.0            cachem_1.0.6           munsell_0.5.0          cellranger_1.1.0       tools_4.1.2           
##  [73] cli_3.4.1              generics_0.1.3         sjlabelled_1.2.0       broom_1.0.1            evaluate_0.17          fastmap_1.1.0         
##  [79] ragg_1.2.1             yaml_2.3.6             rematch2_2.1.2         zip_2.2.0              ggmosaic_0.3.3         rootSolve_1.8.2.3     
##  [85] pbapply_1.6-0          nlme_3.1-155           mime_0.12              xml2_1.3.3             compiler_4.1.2         rstudioapi_0.14       
##  [91] png_0.1-7              e1071_1.7-9            bslib_0.4.1            DescTools_0.99.47      stringi_1.7.8          highr_0.9             
##  [97] gdtools_0.2.4          lattice_0.20-45        Matrix_1.4-0           nloptr_2.0.0           vctrs_0.5.0            pillar_1.8.1          
## [103] lifecycle_1.0.3        GlobalOptions_0.1.2    jquerylib_0.1.4        estimability_1.4.1     insight_0.18.8         lmom_2.9              
## [109] httpuv_1.6.5           R6_2.5.1               promises_1.2.0.1       IRanges_2.28.0         BayesFactor_0.9.12-4.4 gld_2.6.6             
## [115] codetools_0.2-18       boot_1.3-28            MASS_7.3-55            assertthat_0.2.1       fontawesome_0.4.0      rjson_0.2.21          
## [121] rprojroot_2.0.2        withr_2.5.0            S4Vectors_0.32.4       multcomp_1.4-18        bayestestR_0.13.0      expm_0.999-6          
## [127] parallel_4.1.2         hms_1.1.2              quadprog_1.5-8         rpart_4.1.16           coda_0.19-4            class_7.3-20          
## [133] minqa_1.2.4            inum_1.0-4             partykit_1.2-16        base64enc_0.1-3